dna_meta <- read_tsv(here::here("metadata/DNA/vcf_samples_metadata.tsv") )
dna_meta_full <- dna_meta_full <- readxl::read_excel(here::here("metadata/raw_metadata_from_NYGC/ALS Consortium DNA Metadata 20201015 .xlsx")) %>% janitor::clean_names()
rna_meta <- read_tsv(here::here("metadata/RNA/rna_metadata_annotated.tsv"))
covariates <- read_tsv(here::here("ALS_SC/ATXN3/LumbarSpinalCord_splicing_peer20.gene.combined_covariates.txt")) %>%
gather(key = "sample", value = "value", -ID) %>%
spread(key = ID, value = value)
# expansionHunter calls are using external_sample_id
# genotype and others are using VCF ID
coloc_res <- read_tsv(here::here("data/QTLs/Nov_2020/results/COLOC/all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz"))
#coloc_res <- read_tsv(here::here("data/COLOC/Nicolas_suggestive/all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz"))
lsp <- filter(coloc_res, QTL == "NYGC_Lumbar_sQTL", QTL_Gene %in% c("C9orf72", "ATXN3")) %>%
filter(PP.H4.abf > 0.5)
lsp## # A tibble: 5 × 24
## disease GWAS locus GWAS_SNP GWAS_P GWAS_chr GWAS_pos QTL type QTL_SNP
## <chr> <chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr>
## 1 ALS Nicol… locus… rs101433… 3.24e- 7 chr14 92074037 NYGC… sQTL chr14:…
## 2 ALS Nicol… locus… rs101433… 3.24e- 7 chr14 92074037 NYGC… sQTL rs2003…
## 3 ALS Nicol… locus… rs101433… 3.24e- 7 chr14 92074037 NYGC… sQTL rs2003…
## 4 ALS Nicol… locus… rs3849943 3.77e-30 chr9 27543384 NYGC… sQTL rs1537…
## 5 ALS Nicol… locus… rs101433… 3.24e- 7 chr14 92074037 NYGC… sQTL rs5810…
## # … with 14 more variables: QTL_P <dbl>, QTL_MAF <dbl>, QTL_chr <chr>,
## # QTL_pos <dbl>, QTL_junction <chr>, QTL_Gene <chr>, QTL_Ensembl <chr>,
## # nsnps <dbl>, PP.H0.abf <dbl>, PP.H1.abf <dbl>, PP.H2.abf <dbl>,
## # PP.H3.abf <dbl>, PP.H4.abf <dbl>, LD <dbl>
sqtl_junctions <- read_tsv(here::here("ALS_SC/ATXN3/LumbarSpinalCord_sQTL_junctions.bed") )
sample_key <- read_tsv(here::here("ALS_SC/ATXN3/LumbarSpinalCord_sample_key.txt") )
# 197 samples from 197 donors
# get expansionHunter calls for samples with RNA
c9_expansion_hunter_rna <- read.table(here::here("ALS_SC/ATXN3/ExpansionHunter_C9orf72_results.txt"), header=FALSE, stringsAsFactors = FALSE)
# 403 donors with expansionHunter calls
# feb 2021 - 2044 samples with WGS
c9_expansion_hunter_wgs <- read.table(here::here("ALS_SC/ATXN3/02_2021_C9orf72_ExpansionHunter_results.txt"), header=FALSE, stringsAsFactors = FALSE)
# 167 samples have maximum c9 repeat length > 30
# VCF for 434 samples
c9_vcf <- read.table(here::here("ALS_SC/ATXN3/C9_GWAS_SNP_rs3849943.vcf"), header=TRUE, stringsAsFactors = FALSE )
# R read T as TRUE
c9_vcf$ALT <- "T"
# rs3849943 - reference allele should be T, alternate should be C. C has an allele frequency around 20% worldwide
c9_genotypes <- process_vcf(c9_vcf) %>%
select(-genotype_binary)
# fix allele flip here
c9_code_df <- tibble(
genotype = c("1/1", "0/1", "0/0"),
genotype_code = factor(c("T/T", "T/C", "C/C"), levels = c("T/T", "T/C", "C/C") ),
genotype_binary = c(2,1,0)
)
c9_genotypes <- left_join(c9_genotypes, c9_code_df, by = "genotype")
c9_junctions <- process_junctions(sqtl_junctions, "ENSG00000147894.15")
# 11 junctions in 197 samples
#c9_eh_df_rna <- process_expansionhunter(c9_expansion_hunter_rna)
c9_eh_df<- process_expansionhunter(c9_expansion_hunter_wgs)
# 2041 calls, only 1 is NA
# merge together
c9_all <- left_join(c9_junctions, c9_genotypes, by = "sample") %>%
left_join(c9_eh_df, by = "sample") %>%
mutate(genotype_binary = factor(genotype_binary))
# 2167 rows for 197 samples
# extract top colocalised junction
# join on QTL covariates for those samples
c9_sqtl_snp <- c9_all %>% filter(junction_id == "chr9:27567164-27573709") %>%
select(sample, junction_ratio, genotype_binary) %>%
left_join(covariates, by = "sample") %>%
column_to_rownames(var = "sample") %>%
mutate(genotype_binary = as.numeric(genotype_binary))
# do linear regression on junction ratio against genotype - a splicing QTL in R!
summary(lm(junction_ratio ~ genotype_binary, data = c9_sqtl_snp))##
## Call:
## lm(formula = junction_ratio ~ genotype_binary, data = c9_sqtl_snp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4998 -0.4983 -0.1195 0.3538 2.8727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.27972 0.18316 6.987 4.33e-11 ***
## genotype_binary -0.54971 0.07375 -7.454 2.89e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6954 on 195 degrees of freedom
## Multiple R-squared: 0.2217, Adjusted R-squared: 0.2177
## F-statistic: 55.56 on 1 and 195 DF, p-value: 2.892e-12
# P = 1e-12
# now add all the same covariates as tensorQTL
summary(lm(junction_ratio ~ . , data = c9_sqtl_snp))##
## Call:
## lm(formula = junction_ratio ~ ., data = c9_sqtl_snp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.19512 -0.44398 -0.04007 0.37916 2.37281
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.879480 1.312373 1.432 0.15398
## genotype_binary -0.518285 0.080651 -6.426 1.31e-09 ***
## dna_platformX 0.214807 0.143021 1.502 0.13500
## `dna_prepPCR-Free` -0.075160 0.149767 -0.502 0.61643
## InferredCov1 -1.775086 1.441707 -1.231 0.21996
## InferredCov10 0.031426 1.070358 0.029 0.97661
## InferredCov11 -0.026260 1.090898 -0.024 0.98082
## InferredCov12 -0.002791 1.132687 -0.002 0.99804
## InferredCov13 -0.628309 1.137231 -0.552 0.58135
## InferredCov14 -0.075767 1.209412 -0.063 0.95012
## InferredCov15 0.945244 1.377216 0.686 0.49345
## InferredCov16 0.863256 1.473463 0.586 0.55875
## InferredCov17 -0.829829 1.614621 -0.514 0.60797
## InferredCov18 0.361519 1.639576 0.220 0.82575
## InferredCov19 1.282635 1.632608 0.786 0.43319
## InferredCov2 -2.081830 1.149401 -1.811 0.07190 .
## InferredCov20 1.502637 1.790155 0.839 0.40245
## InferredCov3 -0.851180 1.069428 -0.796 0.42721
## InferredCov4 2.972459 1.033283 2.877 0.00454 **
## InferredCov5 -0.613545 1.045911 -0.587 0.55826
## InferredCov6 -0.995302 1.039625 -0.957 0.33977
## InferredCov7 0.422511 1.084553 0.390 0.69735
## InferredCov8 -0.263301 1.050102 -0.251 0.80232
## InferredCov9 1.699327 1.064456 1.596 0.11228
## PC1 134.294938 78.607734 1.708 0.08942 .
## PC2 -68.869449 47.635260 -1.446 0.15012
## PC3 30.729341 24.764210 1.241 0.21639
## PC4 -6.440466 23.312600 -0.276 0.78269
## PC5 -1.668193 7.312508 -0.228 0.81983
## sexmale 0.079182 0.107156 0.739 0.46098
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6906 on 167 degrees of freedom
## Multiple R-squared: 0.3427, Adjusted R-squared: 0.2285
## F-statistic: 3.002 on 29 and 167 DF, p-value: 5.251e-06
# P = 1e-9
# get residuals
c9_sqtl_snp_resid <-
c9_all %>% filter(junction_id == "chr9:27567164-27573709") %>%
select(sample, junction_ratio) %>%
left_join(covariates, by = "sample") %>%
column_to_rownames(var = "sample")
c9_sqtl_snp_resid$resid <- resid(lm(junction_ratio ~ . , data = c9_sqtl_snp_resid) )
## then add back the SNP
c9_sqtl_snp_resid$genotype_binary <- c9_genotypes$genotype_binary[match(row.names(c9_sqtl_snp_resid), c9_genotypes$sample)]
## MAIN FIGURE
plot_c9_snp_splicing <-
c9_sqtl_snp_resid %>%
left_join(c9_code_df, by = "genotype_binary") %>%
ggplot(aes(x = genotype_code, y = resid)) +
geom_jitter(aes(colour = genotype_code), width = 0.2, height = 0, size = 1) +
geom_boxplot(outlier.color = NA, fill = NA) +
labs(y = "Residual splicing ratio", x = "genotype") +
#ggpubr::stat_compare_means() +
theme_jh() +
labs(colour = "rs3849943") +
scale_colour_brewer(type = "qual", palette = 2)
plot_c9_snp_splicingNow use C9orf72 expansion repeat
## Get Expansion Calls
c9_sqtl_repeat <- c9_all %>% filter(junction_id == "chr9:27567164-27573709") %>%
select(sample, junction_ratio, max_call) %>%
left_join(covariates, by = "sample") %>%
column_to_rownames(var = "sample") %>%
filter( !is.na(max_call))
#mutate(max_call = factor(max_call > 30))
# 139 out of 197 samples have ExpansionHunter genotypes
summary(lm(junction_ratio ~ log(max_call), data = c9_sqtl_repeat))##
## Call:
## lm(formula = junction_ratio ~ log(max_call), data = c9_sqtl_repeat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4786 -0.4952 -0.1193 0.4479 2.4330
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.46143 0.10919 -4.226 4.32e-05 ***
## log(max_call) 0.18748 0.03937 4.763 4.81e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7203 on 137 degrees of freedom
## Multiple R-squared: 0.142, Adjusted R-squared: 0.1358
## F-statistic: 22.68 on 1 and 137 DF, p-value: 4.806e-06
# P = 4.81e-06
summary(lm(junction_ratio ~ . , data = c9_sqtl_repeat))##
## Call:
## lm(formula = junction_ratio ~ ., data = c9_sqtl_repeat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.81084 -0.42999 -0.05215 0.42323 1.65273
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.848e+00 1.779e+00 1.038 0.30137
## max_call 1.669e-03 5.881e-04 2.838 0.00541 **
## dna_platformX 3.087e-01 1.653e-01 1.868 0.06442 .
## `dna_prepPCR-Free` NA NA NA NA
## InferredCov1 2.507e+00 2.749e+00 0.912 0.36372
## InferredCov10 -4.387e-02 1.432e+00 -0.031 0.97561
## InferredCov11 9.855e-01 1.492e+00 0.660 0.51040
## InferredCov12 -8.995e-01 1.704e+00 -0.528 0.59866
## InferredCov13 2.194e+00 1.540e+00 1.424 0.15715
## InferredCov14 2.767e+00 1.534e+00 1.804 0.07397 .
## InferredCov15 -1.256e+00 1.765e+00 -0.712 0.47813
## InferredCov16 2.716e-01 1.767e+00 0.154 0.87812
## InferredCov17 -1.950e+00 2.089e+00 -0.934 0.35250
## InferredCov18 1.630e+00 2.028e+00 0.804 0.42310
## InferredCov19 3.177e+00 2.072e+00 1.533 0.12806
## InferredCov2 -6.201e-01 1.775e+00 -0.349 0.72756
## InferredCov20 -4.452e-01 2.251e+00 -0.198 0.84358
## InferredCov3 -9.271e-01 1.364e+00 -0.679 0.49827
## InferredCov4 3.398e+00 1.337e+00 2.542 0.01240 *
## InferredCov5 -2.182e+00 1.464e+00 -1.491 0.13895
## InferredCov6 -4.505e-01 1.367e+00 -0.330 0.74230
## InferredCov7 4.207e-01 1.353e+00 0.311 0.75634
## InferredCov8 1.034e+00 1.356e+00 0.762 0.44774
## InferredCov9 1.954e+00 1.358e+00 1.439 0.15308
## PC1 8.573e+01 9.984e+01 0.859 0.39241
## PC2 -8.938e+01 6.088e+01 -1.468 0.14492
## PC3 2.506e+01 3.206e+01 0.782 0.43601
## PC4 7.387e+00 2.791e+01 0.265 0.79175
## PC5 -9.666e+00 9.648e+00 -1.002 0.31862
## sexmale 2.612e-02 1.427e-01 0.183 0.85513
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7216 on 110 degrees of freedom
## Multiple R-squared: 0.3087, Adjusted R-squared: 0.1328
## F-statistic: 1.755 on 28 and 110 DF, p-value: 0.02119
# P = 0.005
# plot association between repeat length and residualised junction ratio
c9_sqtl_resid_eh <- c9_sqtl_snp_resid
c9_sqtl_resid_eh$max_call <- c9_sqtl_repeat$max_call[match(row.names(c9_sqtl_resid_eh), row.names(c9_sqtl_repeat) )]
c9_sqtl_resid_eh <- c9_sqtl_resid_eh[ !is.na(c9_sqtl_resid_eh$max_call), ]
# 139 samples
# plot repeat length against residualised splicing - MAIN FIGURE
plot_c9_repeat_splicing <-
c9_sqtl_resid_eh %>%
rownames_to_column(var = "sample") %>%
left_join(rna_meta, by = c("sample" = "external_sample_id") ) %>%
left_join( c9_code_df, by = "genotype_binary") %>%
ggplot(aes(x = max_call, y = resid)) +
geom_point(aes(colour = genotype_code), size = 1) +
geom_smooth(method = "lm", colour = "black") +
labs(x = "C9orf72 repeat length", y = "Residualised Splicing ratio") +
ggpubr::stat_cor() +
scale_x_continuous(trans='log10', breaks = c(2,5,10,30,100, 500)) +
labs(colour = "rs3849943") +
theme_jh() +
scale_colour_brewer(type = "qual", palette = 2)
# colour by disease - for supplement
c9_sqtl_resid_eh %>%
rownames_to_column(var = "sample") %>%
left_join(sample_key, by = c("sample" = "participant_id")) %>%
left_join(rna_meta, by = c("sample_id" = "external_sample_id") ) %>%
left_join( c9_code_df, by = "genotype_binary") %>%
ggplot(aes(x = max_call, y = resid)) +
geom_point(aes(colour = disease), size = 0.8) +
geom_smooth(method = "lm", colour = "black") +
labs(x = "C9orf72 repeat length", y = "Residualised Splicing ratio") +
ggpubr::stat_cor() +
scale_x_continuous(trans='log10', breaks = c(2,5,10,30,100, 500)) +
theme_jh()c9_sqtl_resid_eh %>%
filter(!is.na(max_call)) %>%
ggplot(aes(x = max_call > 100, y = junction_ratio)) + geom_boxplot() + ggpubr::stat_compare_means() +
geom_jitter(width = 0.25) +
labs(x = "C9 repeat > 30", y = "Splicing ratio") +
theme_jh()c9_sqtl_resid_eh %>%
ggplot(aes(x = max_call, y = resid)) + geom_point() + geom_smooth(method = "lm") +
labs(x = "C9orf72 repeat length", y = "Residual splicing ratio")c9_sqtl_resid_eh %>%
filter(!is.na(max_call)) %>%
ggplot(aes(x = max_call > 100, y = resid)) + geom_boxplot() + ggpubr::stat_compare_means() +
geom_jitter(width = 0.25) +
labs(x = "C9 repeat > 100", y = "Residual splicing ratio")c9_sqtl_resid_eh %>%
ggplot(aes(x = genotype_binary, y = max_call)) + geom_point()chisq.test(table(expansion = c9_sqtl_resid_eh $max_call > 30, genotype = c9_sqtl_resid_eh$genotype_binary))##
## Pearson's Chi-squared test
##
## data: table(expansion = c9_sqtl_resid_eh$max_call > 30, genotype = c9_sqtl_resid_eh$genotype_binary)
## X-squared = 17.499, df = 2, p-value = 0.0001586
plot_c9_repeat_genotype <-
c9_sqtl_resid_eh %>%
rownames_to_column(var = "sample") %>%
#left_join(rna_meta, by = c("sample" = "external_sample_id") ) %>%
left_join( c9_code_df, by = "genotype_binary") %>%
ggplot(aes(y = max_call, x = genotype_code)) +
geom_jitter(aes(colour = genotype_code), size = 1, width = 0.2, height =0) +
geom_boxplot(outlier.color = NA, fill = NA) +
#geom_smooth(method = "lm", colour = "black") +
labs(y = "C9orf72 repeat length", x = "genotype") +
#ggpubr::stat_cor() +
scale_y_continuous(trans='log10', breaks = c(2,5,10,30,100, 500)) +
labs(colour = "rs3849943") +
theme_jh() +
scale_colour_brewer(type = "qual", palette = 2)c9_multiplot <-
(plot_c9_snp_splicing + plot_c9_repeat_genotype ) /
plot_c9_repeat_splicing &
# theme(axis.ticks.y = element_blank(),
# axis.text.y = element_blank(),
# axis.title.y = element_blank(),
# axis.line.y = element_blank() ) &
# ylim(-1.6,2.6)
# ) +
#plot_layout(nrow = 2, guides = "collect", widths =c(1,2,1)) &
guides(colour = FALSE) #&
#theme(legend.position = "bottom") &
#plot_annotation(title = "rs3849943 - C9orf72 splicing", subtitle = "chr9:27567164-27573709") &
#ylim(-1.6,2.6)
c9_multiplot ggsave(plot = c9_multiplot, filename = here::here("ALS_SC/plots/C9orf72_genotype_repeat_plots.pdf"), width = 3, height = 4.5) 1 out of 60 homozygous ref have expansions 11 out of 50 are heterozygous 5 out of 14 homozygous alt have expansions
therefore there’s clearly a tagging of the repeat by the SNP and vice versa.
C9orf72 repeat length is weakly associated with splicing of the first intron - P = 0.059 (residual ratio)
# plot
c9_all %>%
filter(junction_id == "chr9:27567164-27573709") %>%
ggplot(aes(x = genotype_binary, y = junction_ratio)) + geom_boxplot(aes( group = genotype_binary)) +
facet_wrap(~junction_id)c9_all %>%
filter(!is.na(max_call)) %>%
filter(junction_id == "chr9:27567164-27573709") %>%
ggplot(aes(x = genotype_binary, y = junction_ratio)) +
#geom_boxplot(aes( group = max_call > 80), outlier.colour = NA) +
geom_jitter(aes(colour = max_call > 80)) +
facet_wrap(~junction_id)#chisq.test(table(c9_genotype_eh$genotype_binary, c9_genotype_eh$max_call))
c9_all %>%
filter(!is.na(max_call)) %>%
filter(junction_id == "chr9:27567164-27573709") %>%
mutate(genotype_binary = factor(genotype_binary)) %>%
ggplot(aes(x = max_call > 80, y = junction_ratio)) + geom_boxplot() +
facet_wrap(~junction_id)c9_all %>%
select(sample, genotype_binary, max_call) %>%
filter(!is.na(max_call)) %>%
distinct() %>%
ggplot(aes(x = genotype_binary, y = max_call)) + geom_point()c9_genotype_eh <- inner_join(c9_genotypes, c9_eh_df, by = "sample")
table(SNP_genotype = c9_genotype_eh$genotype_binary, expansion = c9_genotype_eh$max_call > 80)## expansion
## SNP_genotype FALSE TRUE
## 0 18 9
## 1 87 32
## 2 119 1
c9_genotype_eh %>%
ggplot(aes(x = genotype_binary, y = max_call)) +
geom_jitter(height = 0, width = 0.25)c9_model <-
c9_all %>%
filter(!is.na(max_call)) %>%
filter(junction_id == "chr9:27567164-27573709")
summary(lm(junction_ratio ~ as.numeric(genotype_binary), data = c9_model))##
## Call:
## lm(formula = junction_ratio ~ as.numeric(genotype_binary), data = c9_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.43377 -0.50194 -0.08515 0.41028 2.90805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.18285 0.20476 5.777 4.89e-08 ***
## as.numeric(genotype_binary) -0.51893 0.08395 -6.182 6.82e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6877 on 137 degrees of freedom
## Multiple R-squared: 0.2181, Adjusted R-squared: 0.2124
## F-statistic: 38.21 on 1 and 137 DF, p-value: 6.823e-09
summary(lm(junction_ratio ~ as.numeric(max_call), data = c9_model))##
## Call:
## lm(formula = junction_ratio ~ as.numeric(max_call), data = c9_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.19386 -0.54406 -0.09168 0.42007 2.68533
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.1091423 0.0702518 -1.554 0.12259
## as.numeric(max_call) 0.0014902 0.0005401 2.759 0.00659 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.757 on 137 degrees of freedom
## Multiple R-squared: 0.05264, Adjusted R-squared: 0.04573
## F-statistic: 7.612 on 1 and 137 DF, p-value: 0.006589
summary(lm(junction_ratio ~ as.numeric(max_call > 80) + as.numeric(genotype_binary), data = c9_model))##
## Call:
## lm(formula = junction_ratio ~ as.numeric(max_call > 80) + as.numeric(genotype_binary),
## data = c9_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4011 -0.4920 -0.1165 0.4278 2.8392
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.13505 0.22769 4.985 1.85e-06 ***
## as.numeric(max_call > 80) 0.08633 0.17770 0.486 0.628
## as.numeric(genotype_binary) -0.50379 0.08976 -5.613 1.08e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6896 on 136 degrees of freedom
## Multiple R-squared: 0.2194, Adjusted R-squared: 0.208
## F-statistic: 19.12 on 2 and 136 DF, p-value: 4.823e-08
c9_model %>%
ggplot(aes( x = max_call > 80, y = junction_ratio)) + geom_boxplot() + ggpubr::stat_compare_means()Yes, the GWAS variant tags the expansion.
However, does the expansion cause the splicing change? Effect is weaker than that of the SNP.
Is there an enrichment in expanded repeat alleles in ALS patients compared to controls? Obviously yes.
nrow(c9_eh_df)## [1] 2041
# 2041 samples have EH run on them
setdiff(c9_eh_df$sample, dna_meta_full$external_sample_id)## [1] "HDA1368" "HDA1362" "HDA1360" "HDA1364" "HDA1359" "HDA1366" "HDA1367"
## [8] "HDA1363" "HDA1361"
c9_eh_meta <- c9_eh_df %>%
left_join(dna_meta_full, by = c("sample" = "external_sample_id") ) %>%
filter(!is.na(external_subject_id))
dim(c9_eh_meta)## [1] 2032 63
# 2032
c9_eh_meta <- c9_eh_meta %>%
filter(!duplicated(external_subject_id)) %>%# throw out duplicate donors
filter(pct_european >= 0.9) # remove admixed samples - quick and dirty for now until I get full MDS values.
dim(c9_eh_meta)## [1] 1773 63
# 1773 donors
c9_eh_meta <- mutate(c9_eh_meta, disease = case_when(
subject_group_subcategory == "Classical/Typical ALS" ~ "ALS",
subject_group_subcategory == "Classical/Typical ALS, Frontotemporal Dementia (FTD)" ~ "ALS-FTD",
grepl("Frontotemporal Dementia", subject_group_subcategory) ~ "FTD",
grepl("Non-Neurological Control", subject_group) ~ "Control",
TRUE ~ "Other")
)
disease_n <- group_by(c9_eh_meta, disease) %>% tally() %>% mutate(disease_n = paste0(disease,"\n(", n, ")") )
disease_n$disease <- factor(disease_n$disease, levels = c("Control", "ALS", "ALS-FTD", "FTD", "Other") )
disease_n$disease_n <- factor(disease_n$disease_n, levels = disease_n$disease_n[as.numeric(disease_n$disease)])
c9_eh_meta <- left_join(c9_eh_meta, disease_n, by = "disease")
# expansion size
c9_dist_plot <- c9_eh_meta %>%
filter(disease != "Other") %>%
filter(!is.na(disease) ) %>%
mutate(disease_n = forcats::fct_rev(disease_n)) %>%
ggplot(aes(x = disease_n, y = max_call )) +
geom_violin(fill = NA, colour = "darkorange") +
geom_jitter(height = 0, width = 0.25, size = 0.5) +
coord_flip() +
scale_y_continuous(trans = 'log10') +
geom_hline(yintercept = 30, linetype = 3, colour = "red") + theme_jh() +
labs(x = "", y = "C9orf72 expansion max size")
# proportion of group with expansion > 30
c9_prop_plot <- c9_eh_meta %>%
filter(!is.na(disease) ) %>%
filter(disease != "Other") %>%
group_by(disease_n, expand = !is.na(max_call) & max_call > 30) %>%
tally() %>%
pivot_wider(names_from = expand, values_from = n, values_fill = 0, names_prefix = "c9") %>%
mutate(prop = c9TRUE / (c9TRUE + c9FALSE) ) %>%
mutate(prop_label = paste0(signif(prop * 100, 2), "%" ) ) %>%
mutate(disease_n = forcats::fct_rev(disease_n)) %>%
ggplot(aes(x = disease_n, y = prop)) +
geom_col() +
coord_flip() +
labs(y = "Proportion > 30 repeats", x = "") +
theme_jh() +
scale_y_continuous(labels = scales::percent, minor_breaks = NULL, breaks = c(0,0.2), limits = c(0,0.4)) +
geom_text(aes(label = prop_label ), colour = "black", nudge_y = 0.04, size = 3)
c9_dist_plot + c9_prop_plot + plot_layout(ncol = 2, widths = c(3,1))ALS-FTD has highest proportion of individuals with >30 repeats
c9_eh_meta %>%
filter(!is.na(disease) ) %>%
filter(disease != "Other") %>%
# filter(max_call <= 30) %>%
group_by(disease, call = max_call > 30) %>% tally() %>%
pivot_wider(names_from =call, values_from = n, values_fill = 0, names_prefix = "c9_30_" ) %>%
mutate(prop = c9_30_TRUE / (c9_30_TRUE + c9_30_FALSE) )## # A tibble: 4 × 4
## # Groups: disease [4]
## disease c9_30_FALSE c9_30_TRUE prop
## <chr> <int> <int> <dbl>
## 1 ALS 998 103 0.0936
## 2 ALS-FTD 57 16 0.219
## 3 Control 229 1 0.00435
## 4 FTD 89 17 0.160
# subthreshold effects?
c9_eh_meta %>%
filter(!is.na(disease) ) %>%
filter(max_call <= 30) %>%
mutate(call_sum = call1 + call2 ) %>%
ggplot(aes(x = max_call)) + geom_histogram(aes(group = disease), binwidth = 1) +
facet_wrap(~disease_n, scales = "free_y", ncol = 1) +
theme_jh() +
labs(x = "C9orf72 repeat length", title = "C8orf72")c9_eh_meta %>%
filter(!is.na(disease) ) %>%
mutate(disease_n = forcats::fct_rev(disease_n)) %>%
filter(max_call <= 30) %>%
mutate(call_sum = call1 + call2 ) %>%
ggplot(aes(x = disease_n, y = max_call)) +
coord_flip() +
geom_violin(fill = NA, colour = "darkorange") +
geom_jitter(width = 0.25, height = 0, size = 0.5) +#geom_histogram(aes(group = disease, fill = disease), binwidth = 1) +
ggpubr::stat_compare_means(ref.group = unique(c9_eh_meta$disease_n[c9_eh_meta$disease == "Control"]), label = "p.format", label.y.npc = 0.9, label.x.npc = 0.1 ) +
#facet_wrap(~disease_n, scales = "free_y") +
theme_jh() +
labs(title = "C9orf72", x = "Disease", y = "Expansion length")c9_eh_meta %>%
filter(!is.na(disease) ) %>%
filter(max_call <= 30) %>%
group_by(disease, call = max_call > 10) %>% tally() %>%
pivot_wider(names_from =call, values_from = n, values_fill = 0, names_prefix = "c9_10_" ) %>%
mutate(prop = c9_10_TRUE / (c9_10_TRUE + c9_10_FALSE) )## # A tibble: 5 × 4
## # Groups: disease [5]
## disease c9_10_FALSE c9_10_TRUE prop
## <chr> <int> <int> <dbl>
## 1 ALS 870 128 0.128
## 2 ALS-FTD 48 9 0.158
## 3 Control 202 27 0.118
## 4 FTD 82 7 0.0787
## 5 Other 225 22 0.0891
Yes, ALS and FTD cases are enriched for C9 expansions > 30. No difference in proportion of sub-30 expansion lengths between diseases.
all_meta <- left_join(rna_meta, dna_meta, by = "external_subject_id", suffix = c(".rna", ".dna"))
disease_meta <- all_meta %>% select(sample = vcf_id, disease)
# get expansionHunter calls
#a3_expansion_hunter <- read.table(here::here("ALS_SC/a3/ExpansionHunter_a3_results.txt"), header=FALSE, stringsAsFactors = FALSE)
# Feb 2021 - 1877 samples
a3_expansion_hunter <- read.table(here::here("ALS_SC/ATXN3/02_2021_ATXN3_ExpansionHunter_results.txt"), header=FALSE, stringsAsFactors = FALSE)
# VCF
#a3_vcf <- read.table(here::here("ALS_SC/ATXN3/ATXN3_QTL_SNP_rs10150068.vcf"), header=TRUE, stringsAsFactors = FALSE )
a3_vcf <- read.table(here::here("ALS_SC/ATXN3/ALS_rs10143310.vcf"), header=TRUE, stringsAsFactors = FALSE )
a3_qtl_vcf <- read.table(here::here("ALS_SC/ATXN3/ALS_rs200388434.vcf"), header=TRUE, stringsAsFactors = FALSE )
a3_genotypes <- process_vcf(a3_vcf, ID = "rs10143310")
a3_qtl_geno <- process_vcf(a3_qtl_vcf , ID = "rs200388434")
# 425 donors# compare GWAS and QTL SNPs
a3_compare <- left_join(a3_genotypes, a3_qtl_geno, by = c("sample") ) %>%
left_join(dna_meta_full, by = c("sample" = "external_sample_id") ) %>%
filter(pct_european > 0.9) %>%
select(A = genotype_binary.x, B = genotype_binary.y) %>%
drop_na()a3_junctions <- process_junctions(sqtl_junctions, "ENSG00000066427.22")
# 17 junctions
a3_eh_df <- process_expansionhunter(a3_expansion_hunter) %>% left_join(disease_meta, by = "sample")
# repeats for 2781 donors
# is there any enrichment in a3 length in ALS?# merge together
a3_all <- left_join(a3_junctions, a3_genotypes, by = "sample") %>%
left_join(a3_eh_df, by = "sample") %>%
#mutate(genotype_binary) %>%
left_join(disease_meta) %>%
filter(!is.na(genotype_binary))
# a3_all %>%
# filter(!is.na(genotype_binary), junction_id == "chr14:92049814-92050341") %>%
# ggplot(aes(x = genotype_binary, y = junction_ratio)) + geom_boxplot(aes( group = genotype_binary)) +
# facet_wrap(~junction_id)
# a3_all %>%
# filter(!is.na(genotype_binary), junction_id == "chr14:92049814-92050341", !is.na(max_call)) %>%
# ggplot(aes(x = max_call, y = junction_ratio)) + geom_point() + ggpubr::stat_cor()
#
a3_genotype_eh <- inner_join(a3_genotypes, a3_eh_df, by = "sample") %>%
distinct()
#
# a3_genotype_eh %>%
# filter(!is.na(genotype_binary)) %>%
# ggplot(aes(x = max_call) ) +
# geom_dotplot(aes(fill = as.factor(genotype_binary)))
a3_snp <-
a3_all %>%
filter(!is.na(genotype_binary), junction_id == "chr14:92049814-92050341") %>%
select(sample, junction_ratio, genotype_binary) %>%
distinct() %>%
left_join(covariates, by = "sample") %>%
column_to_rownames(var = "sample")
a3_code_df <-
tibble(
genotype_binary = c(0,1,2),
genotype_code = factor( c("G/G","G/A", "A/A"), levels = c("G/G","G/A", "A/A") )
)
a3_sqtl_snp_resid <-
a3_all %>%
filter(junction_id == "chr14:92049814-92050341") %>%
select(sample, junction_ratio) %>%
distinct() %>%
left_join(covariates, by = "sample") %>%
column_to_rownames(var = "sample")
a3_sqtl_snp_resid$resid <- resid(lm(junction_ratio ~ . , data = a3_sqtl_snp_resid) )
## then add back the SNP
a3_sqtl_snp_resid$genotype_binary <- a3_genotypes$genotype_binary[match(row.names(a3_sqtl_snp_resid), a3_genotypes$sample)]
a3_sqtl_snp_resid %>%
left_join(a3_code_df, by = "genotype_binary") %>%
ggplot(aes(x = genotype_code, y = junction_ratio)) +
geom_boxplot(aes(group = genotype_code), outlier.colour = NA) +
geom_jitter(aes(colour = genotype_code), width = 0.25, height = 0) +
labs(y = "Raw junction ratio", x = "rs10143310", colour = "rs10143310") +
theme_jh() +
scale_colour_brewer(type = "qual", palette = 2) +
ggpubr::stat_compare_means()## plot residualised junctions against genotype
plot_a3_sqtl <-
a3_sqtl_snp_resid %>%
left_join(a3_code_df, by = "genotype_binary") %>%
ggplot(aes(x = genotype_code, y = resid)) +
geom_jitter(aes(colour = genotype_code), width = 0.25, height = 0, size = 1) +
geom_boxplot(aes(group = genotype_code), outlier.colour = NA, fill = NA) +
labs(y = "Residual splicing ratio", x = "rs10143310", colour = "rs10143310") +
theme_jh() +
scale_colour_brewer(type = "qual", palette = 2)
plot_a3_sqtlsummary(lm(junction_ratio ~ genotype_binary, a3_sqtl_snp_resid ))##
## Call:
## lm(formula = junction_ratio ~ genotype_binary, data = a3_sqtl_snp_resid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.88322 -0.70638 -0.05576 0.49782 2.58823
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.42893 0.08127 -5.278 3.48e-07 ***
## genotype_binary 0.83434 0.10703 7.795 3.83e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8882 on 194 degrees of freedom
## Multiple R-squared: 0.2385, Adjusted R-squared: 0.2346
## F-statistic: 60.77 on 1 and 194 DF, p-value: 3.827e-13
# P = 3e-13
summary(lm(resid ~ genotype_binary, a3_sqtl_snp_resid ))##
## Call:
## lm(formula = resid ~ genotype_binary, data = a3_sqtl_snp_resid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97715 -0.45024 -0.08418 0.46492 2.35687
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.31015 0.06600 -4.699 4.94e-06 ***
## genotype_binary 0.65364 0.08692 7.520 1.98e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7214 on 194 degrees of freedom
## Multiple R-squared: 0.2257, Adjusted R-squared: 0.2217
## F-statistic: 56.55 on 1 and 194 DF, p-value: 1.984e-12
# P = 1.98e-12summary(lm( junction_ratio ~ genotype_binary, data = a3_snp))##
## Call:
## lm(formula = junction_ratio ~ genotype_binary, data = a3_snp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.88322 -0.70638 -0.05576 0.49782 2.58823
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.42893 0.08127 -5.278 3.48e-07 ***
## genotype_binary 0.83434 0.10703 7.795 3.83e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8882 on 194 degrees of freedom
## Multiple R-squared: 0.2385, Adjusted R-squared: 0.2346
## F-statistic: 60.77 on 1 and 194 DF, p-value: 3.827e-13
# genotype P = 1e-13, R2 = 0.238
summary(lm( junction_ratio ~ ., data = a3_snp))##
## Call:
## lm(formula = junction_ratio ~ ., data = a3_snp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8694 -0.4728 -0.0442 0.3901 2.1971
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.52529 1.45500 -0.361 0.71854
## genotype_binary 0.78999 0.10011 7.891 3.84e-13 ***
## dna_platformX -0.11409 0.15641 -0.729 0.46677
## `dna_prepPCR-Free` 0.36955 0.16514 2.238 0.02656 *
## InferredCov1 2.34263 1.57920 1.483 0.13986
## InferredCov10 0.18446 1.18789 0.155 0.87678
## InferredCov11 -0.58516 1.20568 -0.485 0.62808
## InferredCov12 -1.54656 1.24150 -1.246 0.21462
## InferredCov13 1.88385 1.24587 1.512 0.13242
## InferredCov14 0.21401 1.32325 0.162 0.87172
## InferredCov15 -0.84822 1.49430 -0.568 0.57105
## InferredCov16 0.53320 1.63746 0.326 0.74512
## InferredCov17 0.77591 1.77928 0.436 0.66335
## InferredCov18 -0.04472 1.79702 -0.025 0.98018
## InferredCov19 -0.52204 1.79051 -0.292 0.77099
## InferredCov2 -8.49275 1.27234 -6.675 3.54e-10 ***
## InferredCov20 -0.87916 1.95161 -0.450 0.65295
## InferredCov3 -2.74220 1.17395 -2.336 0.02069 *
## InferredCov4 2.07475 1.13359 1.830 0.06901 .
## InferredCov5 -3.52671 1.14608 -3.077 0.00244 **
## InferredCov6 -3.12124 1.14236 -2.732 0.00697 **
## InferredCov7 -1.08050 1.21000 -0.893 0.37316
## InferredCov8 1.82286 1.14915 1.586 0.11458
## InferredCov9 -0.87924 1.16650 -0.754 0.45207
## PC1 39.10449 86.02630 0.455 0.65002
## PC2 -21.68610 52.15079 -0.416 0.67807
## PC3 21.00597 27.12694 0.774 0.43982
## PC4 -20.22852 25.49876 -0.793 0.42873
## PC5 11.96327 8.11010 1.475 0.14208
## sexmale -0.10658 0.11709 -0.910 0.36403
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7557 on 166 degrees of freedom
## Multiple R-squared: 0.5283, Adjusted R-squared: 0.4459
## F-statistic: 6.412 on 29 and 166 DF, p-value: 1.852e-15
# genotype P = 1e-14, R2 = 0.444
summary(lm( resid ~ genotype_binary, data = a3_sqtl_snp_resid))##
## Call:
## lm(formula = resid ~ genotype_binary, data = a3_sqtl_snp_resid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97715 -0.45024 -0.08418 0.46492 2.35687
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.31015 0.06600 -4.699 4.94e-06 ***
## genotype_binary 0.65364 0.08692 7.520 1.98e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7214 on 194 degrees of freedom
## Multiple R-squared: 0.2257, Adjusted R-squared: 0.2217
## F-statistic: 56.55 on 1 and 194 DF, p-value: 1.984e-12
#
a3_repeat <-
a3_all %>%
filter(!is.na(max_call), junction_id == "chr14:92049814-92050341") %>%
distinct() %>%
select(sample, junction_ratio, max_call) %>%
left_join(covariates, by = "sample") %>%
column_to_rownames(var = "sample")
summary(lm( junction_ratio ~ max_call, data = a3_repeat))##
## Call:
## lm(formula = junction_ratio ~ max_call, data = a3_repeat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8628 -0.7984 -0.1966 0.6860 2.9399
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.63611 0.33340 -1.908 0.0586 .
## max_call 0.03832 0.01735 2.209 0.0290 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.054 on 128 degrees of freedom
## Multiple R-squared: 0.03671, Adjusted R-squared: 0.02919
## F-statistic: 4.878 on 1 and 128 DF, p-value: 0.02897
# genotype P = 0.046, R2 = 0.026
summary(lm( junction_ratio ~ ., data = a3_repeat))##
## Call:
## lm(formula = junction_ratio ~ ., data = a3_repeat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.64314 -0.50899 -0.05018 0.45904 2.54844
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.02694 2.41499 -1.667 0.0985 .
## max_call 0.03555 0.01613 2.204 0.0298 *
## dna_platformX -0.14620 0.21451 -0.682 0.4971
## `dna_prepPCR-Free` NA NA NA NA
## InferredCov1 1.95960 3.59738 0.545 0.5871
## InferredCov10 3.33540 1.90467 1.751 0.0830 .
## InferredCov11 -1.53272 1.92088 -0.798 0.4268
## InferredCov12 -4.23302 2.19530 -1.928 0.0566 .
## InferredCov13 2.70406 2.00756 1.347 0.1810
## InferredCov14 -0.13077 2.08099 -0.063 0.9500
## InferredCov15 -2.79994 2.28486 -1.225 0.2233
## InferredCov16 4.26529 2.26107 1.886 0.0621 .
## InferredCov17 2.83138 2.69523 1.051 0.2960
## InferredCov18 -0.13412 2.67375 -0.050 0.9601
## InferredCov19 -0.65843 2.79904 -0.235 0.8145
## InferredCov2 -11.03794 2.30632 -4.786 5.83e-06 ***
## InferredCov20 -2.90635 2.92381 -0.994 0.3226
## InferredCov3 -1.71432 1.78025 -0.963 0.3379
## InferredCov4 0.92645 1.84165 0.503 0.6160
## InferredCov5 -1.40300 2.00862 -0.698 0.4865
## InferredCov6 -4.36149 1.81656 -2.401 0.0182 *
## InferredCov7 -2.20735 1.73941 -1.269 0.2073
## InferredCov8 1.68239 1.76044 0.956 0.3415
## InferredCov9 -1.17957 1.98588 -0.594 0.5539
## PC1 149.42627 131.44222 1.137 0.2583
## PC2 70.84372 79.28267 0.894 0.3737
## PC3 8.92917 41.82913 0.213 0.8314
## PC4 -25.45228 36.91321 -0.690 0.4921
## PC5 -24.77207 12.78642 -1.937 0.0555 .
## sexmale -0.16643 0.18510 -0.899 0.3707
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9061 on 101 degrees of freedom
## Multiple R-squared: 0.4383, Adjusted R-squared: 0.2826
## F-statistic: 2.815 on 28 and 101 DF, p-value: 8.393e-05
# genotype P = 0.0273, R2 = 0.285# geom_density(aes(group = genotype_binary))
a3_repeat$resid <- a3_sqtl_snp_resid$resid[match(row.names(a3_repeat), row.names(a3_sqtl_snp_resid) )]
summary(lm( resid ~ max_call, data = a3_repeat))##
## Call:
## lm(formula = resid ~ max_call, data = a3_repeat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.35611 -0.59474 -0.08955 0.45327 2.68632
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.47899 0.26657 -1.797 0.0747 .
## max_call 0.02739 0.01387 1.974 0.0505 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8427 on 128 degrees of freedom
## Multiple R-squared: 0.02955, Adjusted R-squared: 0.02197
## F-statistic: 3.898 on 1 and 128 DF, p-value: 0.05049
# P = 0.046
a3_repeat_snp <-
a3_all %>%
filter(!is.na(max_call), !is.na(genotype_binary), junction_id == "chr14:92049814-92050341") %>%
distinct() %>%
select(sample, junction_ratio, max_call, genotype_binary) %>%
left_join(covariates, by = "sample") %>%
mutate( genotype_binary = as.numeric(genotype_binary)) %>%
column_to_rownames(var = "sample")
summary(lm( junction_ratio ~ max_call + genotype_binary, data = a3_repeat_snp))##
## Call:
## lm(formula = junction_ratio ~ max_call + genotype_binary, data = a3_repeat_snp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.91109 -0.71888 -0.02104 0.57295 2.46989
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.488248 0.285444 -1.710 0.0896 .
## max_call 0.003349 0.015641 0.214 0.8308
## genotype_binary 0.951719 0.136530 6.971 1.53e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8999 on 127 degrees of freedom
## Multiple R-squared: 0.3033, Adjusted R-squared: 0.2923
## F-statistic: 27.64 on 2 and 127 DF, p-value: 1.082e-10
summary(lm( junction_ratio ~ ., data = a3_repeat_snp))##
## Call:
## lm(formula = junction_ratio ~ ., data = a3_repeat_snp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.85915 -0.41640 -0.03891 0.41260 2.06728
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.530786 2.064561 -0.741 0.4602
## max_call 0.005141 0.014323 0.359 0.7204
## genotype_binary 0.855057 0.130313 6.562 2.38e-09 ***
## dna_platformX -0.119243 0.180287 -0.661 0.5099
## `dna_prepPCR-Free` NA NA NA NA
## InferredCov1 2.402813 3.023469 0.795 0.4287
## InferredCov10 2.217079 1.609453 1.378 0.1714
## InferredCov11 -1.807002 1.614571 -1.119 0.2657
## InferredCov12 -2.944301 1.855034 -1.587 0.1156
## InferredCov13 2.432729 1.687371 1.442 0.1525
## InferredCov14 -0.002418 1.748669 -0.001 0.9989
## InferredCov15 -1.328956 1.932905 -0.688 0.4933
## InferredCov16 1.042261 1.962349 0.531 0.5965
## InferredCov17 1.093326 2.280116 0.480 0.6326
## InferredCov18 -0.758996 2.248648 -0.338 0.7364
## InferredCov19 -1.352027 2.354280 -0.574 0.5671
## InferredCov2 -9.742614 1.947926 -5.002 2.43e-06 ***
## InferredCov20 -1.206587 2.470361 -0.488 0.6263
## InferredCov3 -2.964736 1.507954 -1.966 0.0521 .
## InferredCov4 1.492236 1.549857 0.963 0.3380
## InferredCov5 -1.951706 1.689826 -1.155 0.2509
## InferredCov6 -3.819686 1.528603 -2.499 0.0141 *
## InferredCov7 -1.137398 1.470617 -0.773 0.4411
## InferredCov8 1.535721 1.479388 1.038 0.3017
## InferredCov9 -0.096022 1.676796 -0.057 0.9544
## PC1 88.829067 110.830350 0.801 0.4248
## PC2 8.646023 67.288632 0.128 0.8980
## PC3 13.721574 35.154703 0.390 0.6971
## PC4 -21.980206 31.020998 -0.709 0.4802
## PC5 -7.859754 11.048695 -0.711 0.4785
## sexmale -0.236334 0.155896 -1.516 0.1327
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7614 on 100 degrees of freedom
## Multiple R-squared: 0.6074, Adjusted R-squared: 0.4935
## F-statistic: 5.334 on 29 and 100 DF, p-value: 1.62e-10
# genotype P = 1e-8
# repeat P = 0.5
a3_to_plot <- a3_repeat_snp
a3_to_plot$resid <- a3_sqtl_snp_resid$resid[ match(row.names(a3_to_plot), row.names(a3_sqtl_snp_resid) ) ]
plot_repeat_splicing <-
a3_to_plot %>%
left_join(a3_code_df, by = "genotype_binary") %>%
ggplot(aes(x = max_call, y = resid)) +
geom_point(aes(colour = genotype_code), size = 1) +
geom_smooth(method = "lm", colour = "black") +
theme_jh() +
ggpubr::stat_cor() +
scale_colour_brewer(type = "qual", palette = 2) +
labs(y = "Residual splicing ratio", x = "CAG repeat length", colour = "rs10143310") +
scale_x_continuous(breaks = c(0,10,20,30), limits = c(0,30))
plot_repeat_splicing cor.test(a3_repeat_snp$genotype_binary, a3_repeat_snp$max_call)##
## Pearson's product-moment correlation
##
## data: a3_repeat_snp$genotype_binary and a3_repeat_snp$max_call
## t = 3.8318, df = 128, p-value = 0.0001984
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1572889 0.4671691
## sample estimates:
## cor
## 0.3207872
chisq.test(table(a3_repeat_snp$genotype_binary, a3_repeat_snp$max_call > 16))##
## Pearson's Chi-squared test
##
## data: table(a3_repeat_snp$genotype_binary, a3_repeat_snp$max_call > 16)
## X-squared = 31.994, df = 2, p-value = 1.129e-07
plot_a3_genotype_repeat <-
a3_repeat_snp %>%
left_join(a3_code_df, by = "genotype_binary") %>%
ggplot(aes(x = genotype_code, y = max_call) ) +
geom_jitter(aes(colour = genotype_code), width = 0.25, height = 0, size = 1) +
geom_boxplot(outlier.colour = NA, aes(group = genotype_binary), fill = NA) +
scale_y_continuous(breaks = c(0,10,20,30), limits = c(0,30)) +
labs(y = "CAG repeat length", x = "rs10143310", colour = "rs10143310") +
theme_jh() +
scale_colour_brewer(type = "qual", palette = 2)
atxn3_multiplot <-
(plot_a3_sqtl +plot_a3_genotype_repeat) /
plot_repeat_splicing &
theme(axis.ticks = element_line(colour = "black")) &
# theme(axis.ticks.y = element_blank(),
# axis.text.y = element_blank(),
# axis.title.y = element_blank(),
# axis.line.y = element_blank() ) &
guides(colour = FALSE)
atxn3_multiplotggsave(plot = atxn3_multiplot, filename = here::here("ALS_SC/plots/ATXN3_genotype_repeat_plots.pdf"), width = 3,
height = 4.5) The lead QTL SNP for the ATX3 sQTL is tagging the repeat expansion!
Questions:
Does including all the covariates increase the signal?
If the SNP is tagging the expansion, and the expansion affects splicing, then the expansion should correlate better with the splicing change than the tagging SNP, no?
## DNA cohort as well
a3_eh_meta <- a3_eh_df %>%
left_join(dna_meta_full, by = c("sample" = "external_sample_id") ) %>%
filter(!is.na(external_subject_id)) %>%
filter(!duplicated(external_subject_id)) %>%
filter(!is.na(max_call))
# throw out duplicate donors
# 1869 unique donors from 2774 samples
a3_eh_meta <- a3_eh_meta %>%
filter(pct_european >= 0.9) # remove admixed samples - quick and dirty for now until I get full MDS values.
# 1634 Europeans
# I can only match metadata for 1559 non-duplicate donors. Time to update metadata? Emailed Nadia and Jennifer
a3_eh_meta <- mutate(a3_eh_meta, disease = case_when(
subject_group_subcategory == "Classical/Typical ALS" ~ "ALS",
subject_group_subcategory == "Classical/Typical ALS, Frontotemporal Dementia (FTD)" ~ "ALS-FTD",
grepl("Frontotemporal Dementia", subject_group_subcategory) ~ "FTD",
grepl("Non-Neurological Control", subject_group) ~ "Control",
TRUE ~ "Other")
)
# 1600 donors
disease_n <- group_by(a3_eh_meta, disease) %>% tally() %>% mutate(disease_n = paste0(disease,"\n(", n, ")") )
disease_n$disease <- factor(disease_n$disease, levels = c("Control", "ALS", "FTD", "ALS-FTD", "Other") )
disease_n$disease_n <- factor(disease_n$disease_n, levels = disease_n$disease_n[as.numeric(disease_n$disease)])
a3_eh_meta <- left_join(a3_eh_meta, disease_n, by = "disease")
# a3_eh_meta %>%
# filter(!is.na(disease)) %>%
# ggplot(aes(x = disease_n, y = max_call)) + geom_violin(fill = NA) + geom_jitter(width = 0.25, height = 0, size = 0.5, colour = "gray") +
# #coord_flip() +
# ggpubr::stat_compare_means(ref.group = disease_n$disease_n[2]) +
# theme_jh() + labs(y ="Max repeat length", title = "ATXN3", x = "")
#
#
# a3_eh_meta %>%
# # filter(disease %in% c("ALS", "Control")) %>%
# ggplot(aes(x = call1 + call2 ) ) + geom_histogram(aes(fill = disease), binwidth = 1) +
# facet_wrap(~disease, scales = "free_y", nrow = 4)
a3_threshold <- 16
a3_dist_plot <-
a3_eh_meta %>%
filter(disease %in% c("ALS", "Control")) %>%
filter(!is.na(disease) ) %>%
mutate(disease_n = forcats::fct_rev(disease_n)) %>%
ggplot(aes(x = disease_n, y = max_call )) +
geom_violin(fill = NA, colour = "darkorange") +
geom_jitter(height = 0, width = 0.25, size = 0.5) +
coord_flip() +
geom_hline(yintercept = a3_threshold, linetype = 3, colour = "red") + theme_jh() +
labs(x = "", y = "ATXN3 expansion max size")
# proportion of group with expansion > 16
a3_prop_plot <- a3_eh_meta %>% filter(!is.na(disease) ) %>%
filter(disease %in% c("ALS", "Control")) %>%
group_by(disease_n, expand = !is.na(max_call) & max_call > a3_threshold) %>%
tally() %>%
pivot_wider(names_from = expand, values_from = n, values_fill = 0, names_prefix = "a3") %>%
mutate(prop = a3TRUE / (a3TRUE + a3FALSE) ) %>%
mutate(prop_label = paste0(signif(prop * 100, 2), "%" ) ) %>%
mutate(disease_n = forcats::fct_rev(disease_n)) %>%
ggplot(aes(x = disease_n, y = prop)) +
geom_col() +
coord_flip() +
labs(y = "Proportion > 16 repeats", x = "") +
theme_jh() +
scale_y_continuous(labels = scales::percent, minor_breaks = NULL, breaks = c(0,0.2), limits = c(0,0.25)) +
geom_text(aes(label = prop_label ), colour = "black", nudge_y = 0.04, size = 3)
# proportion European in each sample by disease
# FTD slightly higher distribution - almost all from UK, less diverse than other diease groups
a3_eh_meta %>%
ggplot(aes(x = disease, y = as.numeric(pct_european) )) + geom_jitter() + geom_violin(fill = NA) + theme_jh() +
ggpubr::stat_compare_means(ref.group = "Control")# table
a3_eh_meta %>%
filter(!is.na(disease) ) %>%
# filter(max_call <= 30) %>%
group_by(disease, call = max_call > 15) %>% tally() %>%
pivot_wider(names_from =call, values_from = n, values_fill = 0, names_prefix = "a3_30_" ) %>%
mutate(prop = a3_30_TRUE / (a3_30_TRUE + a3_30_FALSE) )## # A tibble: 5 × 4
## # Groups: disease [5]
## disease a3_30_FALSE a3_30_TRUE prop
## <chr> <int> <int> <dbl>
## 1 ALS 186 805 0.812
## 2 ALS-FTD 21 48 0.696
## 3 Control 46 156 0.772
## 4 FTD 24 71 0.747
## 5 Other 52 191 0.786
a3_bar_plot <- a3_eh_meta %>%
filter(disease %in% c("ALS", "Control")) %>%
filter(!is.na(disease) ) %>%
#filter(max_call <= 30) %>%
#mutate(call_sum = call1 + call2 ) %>%
ggplot(aes(x = max_call)) + geom_histogram(aes(group = disease), binwidth = 1) +
facet_wrap(~disease_n, scales = "free_y", ncol = 1) +
theme_jh() +
labs(x = "ATXN3 repeat length", title = "ATXN3")
a3_dist_plot + a3_bar_plot + plot_layout(ncol = 2, widths = c(1,1))# tally
a3_tally <- a3_eh_meta %>%
filter(disease %in% c("Control", "ALS") ) %>%
group_by(disease, max_call) %>% tally() %>%
pivot_wider(names_from =disease, values_from = n, values_fill = 0 )
a3_tally$ALS_sum <- sum(a3_tally$ALS)
a3_tally$Control_sum <- sum(a3_tally$Control)
a3_tally$ALS_prop <- a3_tally$ALS / a3_tally$ALS_sum
a3_tally$Control_prop <- a3_tally$Control / a3_tally$Control_sum
a3_tally$p.value <- map_dbl(1:nrow(a3_tally), ~{
print(.x)
.x <- a3_tally[.x,]
prop.test(x = c(.x[,2, drop = TRUE], .x[,3, drop = TRUE]), n = c(.x[,4, drop = TRUE], .x[,5, drop = TRUE]))$p.value
})## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
a3_tally$padj <- p.adjust(a3_tally$p.value, method = "bonferroni")
plot_atxn3_eh_tally <-
a3_tally %>%
filter(max_call > 15 & max_call < 30) %>%
select(max_call, Control_prop, ALS_prop, p.value, padj) %>%
pivot_longer(cols = Control_prop:ALS_prop, values_to = "prop", names_to = "disease" ) %>%
mutate(disease = gsub("_prop", "", disease)) %>%
left_join(disease_n, by = "disease") %>%
ggplot(aes(x = max_call, y = prop)) + geom_col(aes(fill = disease_n), position = position_dodge(), width = 0.75 ) +
theme_jh() +
labs(x = "Maximum ATXN3 repeat length", y = "Proportion of group", fill = "Group") +
scale_y_continuous(labels = scales::percent_format(accuracy = 1), limits = c(0,0.26), expand = c(0,0)) +
geom_hline(yintercept = 0) +
geom_text(aes(label = signif(p.value, 2), y = 0.25 ), size =2 )
ggsave(plot = plot_atxn3_eh_tally, filename = here::here("ALS_SC/plots/ATXN3_WGS_EH_tally.pdf"), width =6, height = 3 )
# threshold at 16
a3_eh_meta %>%
filter(disease %in% c("Control", "ALS") ) %>%
group_by(disease, call = max_call >= 21) %>% tally() %>%
pivot_wider(names_from =call, values_from = n, values_fill = 0, names_prefix = "a3_" ) %>%
mutate(n = a3_FALSE + a3_TRUE) %>%
mutate(prop = a3_TRUE / (a3_TRUE + a3_FALSE) ) %>%
mutate(p.value = prop.test(x = .$a3_TRUE, n = .$n)$p.value )## # A tibble: 2 × 6
## # Groups: disease [2]
## disease a3_FALSE a3_TRUE n prop p.value
## <chr> <int> <int> <int> <dbl> <dbl>
## 1 ALS 677 314 991 0.317 0.148
## 2 Control 149 53 202 0.262 0.148
prop.test(x = c(314, 53), n = c(sum(a3_tally$ALS), sum(a3_tally$Control)) )##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(314, 53) out of c(sum(a3_tally$ALS), sum(a3_tally$Control))
## X-squared = 2.0891, df = 1, p-value = 0.1484
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.01573188 0.12468273
## sample estimates:
## prop 1 prop 2
## 0.3168517 0.2623762
# P = 0.36Compare with ATXN2
a2_eh_meta <- a3_eh_meta
max_atxn2 <- function(x){
df <- str_split_fixed(x, "/", 2)
lengths <- as.numeric(apply(df, MARGIN = 1, max))
return(lengths)
}
a2_eh_meta$atxn2_max <- max_atxn2(a2_eh_meta$atxn2_repeat_size)
a2_tally <- a2_eh_meta %>%
filter(disease %in% c("Control", "ALS") ) %>%
group_by(disease, atxn2_max) %>% tally() %>%
pivot_wider(names_from =disease, values_from = n, values_fill = 0 ) %>%
arrange(atxn2_max)
a2_tally$ALS_sum <- sum(a2_tally$ALS)
a2_tally$Control_sum <- sum(a2_tally$Control)
a2_tally$ALS_prop <- a2_tally$ALS / a2_tally$ALS_sum
a2_tally$Control_prop <- a2_tally$Control / a2_tally$Control_sum
a2_tally$p.value <- map_dbl(1:nrow(a2_tally), ~{
print(.x)
.x <- a2_tally[.x,]
prop.test(x = c(.x[,2, drop = TRUE], .x[,3, drop = TRUE]), n = c(.x[,4, drop = TRUE], .x[,5, drop = TRUE]))$p.value
})## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
a2_tally$padj <- p.adjust(a2_tally$p.value, method = "bonferroni")
a2_tally %>%
filter(atxn2_max > 24 & atxn2_max < 34 ) %>%
select(atxn2_max, Control_prop, ALS_prop, p.value, padj) %>%
pivot_longer(cols = Control_prop:ALS_prop, values_to = "prop", names_to = "disease" ) %>%
mutate(disease = gsub("_prop", "", disease)) %>%
left_join(disease_n, by = "disease") %>%
ggplot(aes(x = atxn2_max, y = prop)) + geom_col(aes(fill = disease_n), position = position_dodge(), width = 0.75 ) +
theme_jh() +
labs(title = "ATXN2", x = "repeat length") +
scale_y_continuous(labels = scales::percent) +
geom_hline(yintercept = 0) +
geom_text(aes(label = signif(p.value, 2), y = 0.2 ) )